NYC Shootings Reported to Police

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)


#import libraries
#PLEASE NOTE: PLEASE INSTALL BELOW PACKAGES PRIOR TO RUNNING IF NOT PREVIOUSLY INSTALLED
library(tidyverse)
library(lubridate)
library(knitr)
library(scales)
library(forcats)
library(plotly)
library(DT)

#import data to read and store as variable
shooting_data <- read_csv("https://data.cityofnewyork.us/api/views/833y-fsy8/rows.csv")

#update date fields to proper class
shooting_data$OCCUR_DATE <- mdy(shooting_data$OCCUR_DATE)

shooting_data$HOUR <- hour(shooting_data$OCCUR_TIME)

 

Time of Day and Volume of Shootings

# No output in this chunk
# Create dataframe of shootings grouped by BORO and hour they occured
hourly_data <- shooting_data %>%
  group_by(BORO, HOUR) %>%
  summarise(Shootings = n()) %>%
  group_by(BORO) %>%
  mutate(Total_Shootings = sum(Shootings)) %>%
  arrange(desc(Total_Shootings)) %>%
  mutate(BORO = fct_reorder(BORO, Total_Shootings, max),
         PCT_Shootings = Shootings/Total_Shootings,
         Percent_Shootings = percent(PCT_Shootings, 0.01))

What Time of Day Saw the Most Shooting Volume Across the 5 Boroughs?

# Create a plot showing what hour shootings take place
# Use facet wrap to display distinct graphs for the boroughs

line_plot <- hourly_data %>%
  ggplot(aes(x = HOUR, y = PCT_Shootings, label = Percent_Shootings, label2 = BORO)) +
  geom_area(stat = "identity", fill = "darkturquoise") +
  scale_y_continuous(labels = percent) +
  theme_classic() +
  facet_wrap(~ BORO) +
  theme(panel.spacing = unit(2, "lines")) +
  labs(x = NULL, y = NULL, title = "Percentage of Overall Shootings by Hour of Day")

ggplotly(line_plot, tooltip = c("x", "label", "label2"))

 

# Get both maximum and minimum shooting volume totals for borough/hour
# Use stringr library to print nicely in table

min_max_hours <- hourly_data %>%
  group_by(BORO) %>%
  mutate(`Max Shootings` = max(Shootings),
         `Min Shootings` = min(Shootings),
         TIME = ifelse(HOUR == 0, "12 AM", 
                       ifelse(HOUR < 12, str_c(HOUR, " AM"),
                              ifelse(HOUR == 12, "12 PM", str_c(HOUR-12, " PM"))))) %>%
  ungroup() %>%
  filter(Shootings == `Max Shootings` | Shootings == `Min Shootings`) %>%
  mutate(Category = ifelse(Shootings == `Max Shootings`, "Max", "Min"),
         Values = str_c(TIME, " (", Shootings, ")")) %>%
  select(BORO, Values, Category) %>%
  pivot_wider(names_from = BORO, values_from = Values)
  
kable(min_max_hours, caption = "Highest and Lowest Shootings by Hour")
Highest and Lowest Shootings by Hour
Category BROOKLYN BRONX QUEENS MANHATTAN STATEN ISLAND
Min 9 AM (73) 8 AM (48), 9 AM (48) 8 AM (28) 7 AM (20), 9 AM (20) 10 AM (4)
Max 11 PM (817) 12 AM (599) 2 AM (304) 12 AM (270) 4 AM (59)

 

Shootings by Month

# Get shooting volume by borough and month

month_grouped <- shooting_data %>%
  mutate(Month = month(OCCUR_DATE, label = TRUE, abbr = FALSE)) %>%
  group_by(BORO, Month) %>%
  summarise(Shootings = n())

Volume of Shootings by Month and Borough

#Show bar graphs for each borough and shooting volume based on hour

month_bar <- month_grouped %>%
  ggplot(aes(x = fct_rev(Month), y = Shootings, label = Month, label2 = BORO)) +
  geom_bar(stat = "identity", fill = "darkturquoise") +
  theme_classic() +
  labs(x = NULL, y = NULL, 
       title = "NYC Volume of Shootings from 2006-2020 by Month and Borough") +
  coord_flip() +
  facet_wrap(~ BORO, scales = "free", ncol = 2) +
  theme(panel.spacing = unit(3, "lines"))

ggplotly(month_bar, tooltip = c("label", "y", "label2"))

 

# Similar to hourly table get min/max values and format to print nicely in data table

min_max_months <- month_grouped %>%
  group_by(BORO) %>%
  mutate(`Max Shootings` = max(Shootings),
         `Min Shootings` = min(Shootings)) %>%
  ungroup() %>%
  filter(Shootings == `Max Shootings` | Shootings == `Min Shootings`) %>%
  mutate(Category = ifelse(Shootings == `Max Shootings`, "Max", "Min"),
         Values = str_c(Month, " (", Shootings, ")")) %>%
  select(-`Max Shootings`, -`Min Shootings`, -Shootings, -Month) %>%
  pivot_wider(names_from = BORO, values_from = Values)
  
kable(min_max_months, caption = "Highest and Lowest Shootings by Month")
Highest and Lowest Shootings by Month
Category BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
Min February (312) February (444) February (146) February (215) February (32)
Max August (824) July (1260) August (330) August (379) July (84)

Month and Time of Day

# Not analyzing by Borough at this time
# Instead of looking at individual hours/months try and group by time/month ranges


shooting_data <- shooting_data %>%
  mutate(Month_Group = ifelse(month(OCCUR_DATE) < 4, "Jan-Mar", 
                              ifelse(month(OCCUR_DATE) < 7, "Apr-Jun", 
                                     ifelse(month(OCCUR_DATE) < 10, "Jul-Sep", "Oct-Dec"))),
         Hour_Group = ifelse(HOUR < 3 | HOUR > 22, "11PM - 2AM",
                             ifelse(HOUR < 8, "3 - 7AM",
                                    ifelse(HOUR < 13, "8AM - 12PM",
                                           ifelse(HOUR < 18, "1 - 5PM", "6 - 10PM")))))
  
# Create factors for Months and Hours
# Ensure levels are sorted (earliest first in graphs/tables)

shooting_data$Hour_Group <- factor(shooting_data$Hour_Group,
                                levels = c("3 - 7AM", "8AM - 12PM", "1 - 5PM",
                                           "6 - 10PM", "11PM - 2AM"))

shooting_data$Month_Group <- factor(shooting_data$Month_Group,
                                 levels = c("Jan-Mar", "Apr-Jun", "Jul-Sep", "Oct-Dec"))


month_time <- shooting_data %>%
  group_by(Month_Group, Hour_Group) %>%
  summarise(Shootings = n())

Plotting Shooting Volume Across All Boroughs by Month and Time Ranges

# Plot month/hour ranges data

month_time_plot <- month_time %>%
  ggplot(aes(x = Month_Group, y = Shootings, fill = Hour_Group)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Month Range", y = "Volume of Shootings", fill = "Time of Day",
       title = "Shootings Volume by Month and Time of Day") +
  theme_classic() +
  scale_fill_brewer(palette = "Pastel2", direction = -1)

ggplotly(month_time_plot)

 

 

 

Predictive Model for Shootings on Time of Day

# Use similar data based on hour/month ranges to create a predictive model
# Borough not included
# Model Goal: Given the total count of shootings in a given year predict the volume of shootings given a month/hour range combination

hour_model_data <- shooting_data %>%
  mutate(Year = year(OCCUR_DATE)) %>%
  #filter(Year >= 2019) %>%
  group_by(Hour_Group, Month_Group, Year) %>%
  summarise(Shootings = n()) %>%
  group_by(Year) %>%
  mutate(Total_Shootings = sum(Shootings))
    
hour_model <- lm(Shootings ~ Total_Shootings + Hour_Group + Month_Group, data = hour_model_data)

#Print output of model
summary(hour_model)
## 
## Call:
## lm(formula = Shootings ~ Total_Shootings + Hour_Group + Month_Group, 
##     data = hour_model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.193 -17.056  -2.953  11.708 123.007 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -38.273333   7.742803  -4.943 1.30e-06 ***
## Total_Shootings        0.050000   0.004032  12.402  < 2e-16 ***
## Hour_Group8AM - 12PM -42.333333   4.977887  -8.504 9.73e-16 ***
## Hour_Group1 - 5PM     -3.450000   4.977887  -0.693    0.489    
## Hour_Group6 - 10PM    56.383333   4.977887  11.327  < 2e-16 ***
## Hour_Group11PM - 2AM  58.366667   4.977887  11.725  < 2e-16 ***
## Month_GroupApr-Jun    29.680000   4.452358   6.666 1.32e-10 ***
## Month_GroupJul-Sep    49.800000   4.452358  11.185  < 2e-16 ***
## Month_GroupOct-Dec    18.440000   4.452358   4.142 4.52e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.27 on 291 degrees of freedom
## Multiple R-squared:  0.7527, Adjusted R-squared:  0.7459 
## F-statistic: 110.7 on 8 and 291 DF,  p-value: < 2.2e-16
#Add in new column to model dataframe with predictions
hour_model_data$Prediction <- predict(hour_model)

#Create column showing how far off prediction was for each data point
hour_model_data <- hour_model_data %>%
  mutate(Prediction_Difference = Prediction - Shootings)
# Create graphs of models performance
# Due to large data, only printing graphs for certain years in data set


model_graph_data <- hour_model_data %>%
  select(-Total_Shootings,-Prediction_Difference) %>%
  pivot_longer(cols = c(-Hour_Group, -Year, -Month_Group))


model_graph_2008 <- model_graph_data %>%
  filter(Year == 2008) %>%
  ggplot(aes(x = Month_Group, y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2008") +
  scale_fill_brewer(palette = "Pastel1") +
  facet_wrap(~ Hour_Group, scales = "free") +
  theme(panel.spacing = unit(3, "lines"))

model_graph_2015 <- model_graph_data %>%
  filter(Year == 2015) %>%
  ggplot(aes(x = Month_Group, y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2015") +
  scale_fill_brewer(palette = "Pastel1") +
  facet_wrap(~ Hour_Group, scales = "free") +
  theme(panel.spacing = unit(3, "lines"))


model_graph_2020 <- model_graph_data %>%
  filter(Year == 2020) %>%
  ggplot(aes(x = Month_Group, y = value, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_classic() +
  labs(x = NULL, y = NULL, fill = "Metric", title = "Predicted Total Shootings 2020") +
  scale_fill_brewer(palette = "Pastel1") +
  facet_wrap(~ Hour_Group, scales = "free") +
  theme(panel.spacing = unit(3, "lines"))



ggplotly(model_graph_2008)
ggplotly(model_graph_2015)
ggplotly(model_graph_2020)
#get average prediction difference for reference

pred_diff_avg <- round(mean(abs(hour_model_data$Prediction_Difference)), 2)

   

Average Prediction Value Error (Absolute Value) 19.52

 

#Shrink model data to most recent years and print table
condensed_model_table <- hour_model_data %>%
  select(-Total_Shootings) %>%
  mutate(Prediction = round(Prediction),
         Prediction_Difference = round(Prediction_Difference)) %>%
  filter(Year >= 2015) %>%
  arrange(Hour_Group, Month_Group, desc(Year))

datatable(condensed_model_table, caption = "Model Performance 2015-2020",
          options = list(pageLength = 30))